# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
library(kableExtra)
library(broom)
library(knitr)
library(pROC)
library(ggpattern)
library(tidyr)
library(ResourceSelection)
library(sf)

Setting up Data

# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
##     HH1   HH2    LN   WM1   WM2   WM3 WMINT   WM4   WM5  WM6D  WM6M  WM6Y   WM8
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     2     3     1     2     3    92    91    92    19    11  2020     2
## 2     1     4     2     1     4     2    92    91    92    18    11  2020     1
## 3     1     9     4     1     9     4    92    91    92    18    11  2020     1
## 4     1    10     4     1    10     4    92    91    92    18    11  2020     2
## 5     1    11     4     1    11     4    92    91    92    19    11  2020     2
## 6     1    11     5     1    11     5    92    91    92    18    11  2020     2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## #   WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## #   WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## #   WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## #   WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## #   WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## #   WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- select(female, WAGEM, MSTATUS, HH6, HH7, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)

# View the first few rows of the new dataframe
summary(female_df)
##      WAGEM          MSTATUS           HH6             HH7           welevel    
##  Min.   :10.00   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.00  
##  1st Qu.:18.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.00  
##  Median :20.00   Median :1.000   Median :2.000   Median :3.000   Median :2.00  
##  Mean   :20.92   Mean   :1.413   Mean   :1.683   Mean   :3.404   Mean   :2.46  
##  3rd Qu.:23.00   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000   3rd Qu.:3.00  
##  Max.   :47.00   Max.   :9.000   Max.   :2.000   Max.   :6.000   Max.   :9.00  
##  NA's   :2026    NA's   :11      NA's   :11      NA's   :11      NA's   :524   
##    insurance       ethnicity        windex5           CP2       
##  Min.   :1.000   Min.   :1.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :1.135   Mean   :2.035   Mean   :2.494   Mean   :1.431  
##  3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :9.000   Max.   :6.000   Max.   :5.000   Max.   :9.000  
##  NA's   :524                                     NA's   :864    
##       HA1             MT4             MT9             MT11     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00  
##  Median :1.000   Median :2.000   Median :1.000   Median :1.00  
##  Mean   :1.215   Mean   :1.664   Mean   :1.365   Mean   :1.12  
##  3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.00  
##  Max.   :9.000   Max.   :9.000   Max.   :9.000   Max.   :9.00  
##  NA's   :524     NA's   :524     NA's   :2496    NA's   :524
# Rename the columns
female_df <- female_df %>%
  rename(
  age_first_marriage = WAGEM,
  marital_status = MSTATUS,
  area = HH6,
  region = HH7,
  education_level = welevel,
  health_insurance = insurance,
  ethnicity = ethnicity,
  wealth_index = windex5,
  current_contraceptive_use = CP2,
  awareness_hiv_aids = HA1,
  used_computer_tablet = MT4,
  used_internet = MT9,
  owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
##        age_first_marriage            marital_status                      area 
##                      2026                        11                        11 
##                    region           education_level          health_insurance 
##                        11                       524                       524 
##                 ethnicity              wealth_index current_contraceptive_use 
##                         0                         0                       864 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                       524                       524                      2496 
##         owns_mobile_phone 
##                       524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
  mutate(
    current_contraceptive_use = na_if(current_contraceptive_use, 9),
    used_internet = na_if(used_internet, 9),
    health_insurance = na_if(health_insurance, 9),
    education_level = na_if(education_level, 9),
    awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
    used_computer_tablet = na_if(used_computer_tablet, 9),
    owns_mobile_phone = na_if(owns_mobile_phone, 9),
    marital_status = na_if(marital_status, 9)
  )

# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)

# Recode the value 0 to NA for 'wealth_index'
female_df$wealth_index <- na_if(female_df$wealth_index, 0)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.92      Mean   :1.408   Mean   :1.683   Mean   :3.404  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :3.000   Max.   :2.000   Max.   :6.000  
##  NA's   :2026       NA's   :18      NA's   :11      NA's   :11     
##  education_level health_insurance   ethnicity      wealth_index  
##  Min.   :0.00    Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.00    1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.00    Median :1.0000   Median :1.000   Median :2.000  
##  Mean   :2.46    Mean   :0.8659   Mean   :1.586   Mean   :2.615  
##  3rd Qu.:3.00    3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.00    Max.   :1.0000   Max.   :4.000   Max.   :5.000  
##  NA's   :525     NA's   :525      NA's   :1148    NA's   :524    
##  current_contraceptive_use awareness_hiv_aids used_computer_tablet
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000      
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:0.0000      
##  Median :1.0000            Median :1.0000     Median :0.0000      
##  Mean   :0.5842            Mean   :0.7975     Mean   :0.3412      
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000      
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000      
##  NA's   :885               NA's   :541        NA's   :532         
##  used_internet    owns_mobile_phone
##  Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :1.0000   Median :1.0000   
##  Mean   :0.6394   Mean   :0.8831   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.0000   Max.   :1.0000   
##  NA's   :2501     NA's   :529

Creating New Variable Access to Media

# Combine individual variables for access to the internet, phone, and computer into a single variable
# This new variable "access_to_media" will have a value of 1 if the respondent has access to any of these media sources, and 0 if not
# This provides a more comprehensive measure of media access
female_df <- female_df %>%
  mutate(access_to_media = ifelse(used_computer_tablet == 1 | used_internet == 1 | owns_mobile_phone == 1, 1, 0))

# In later analysis, use access_to_media instead of the 3 separate variables
# Exporting female_df to a CSV file in the current working directory
#write.csv(female_df, "female_df.csv", row.names = FALSE)

Distributions of Data

# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(female_df, aes(x = age_first_marriage)) +
  geom_histogram(fill = "#F7C0C8", color = "black") +
  theme_minimal() +
  ggtitle("Histogram of Age at First Marriage") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).

Handling Missing Data

# Before that, let's double check the count of missing values by column
count_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(count_missing)
##        age_first_marriage            marital_status                      area 
##                      2026                        18                        11 
##                    region           education_level          health_insurance 
##                        11                       525                       525 
##                 ethnicity              wealth_index current_contraceptive_use 
##                      1148                       524                       885 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                       541                       532                      2501 
##         owns_mobile_phone           access_to_media 
##                       529                       529
# Make a copy of female_df for imputation
imputed_df <- female_df
# Handle missing values in 'Age at First Marriage'

# Calculate the median value for 'Age at First Marriage', excluding NA values
median_age_first_marriage <- median(imputed_df$age_first_marriage, na.rm = TRUE)

# Impute missing values in 'Age at First Marriage' with the median value
imputed_df$age_first_marriage[is.na(imputed_df$age_first_marriage)] <- median_age_first_marriage
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
  # The mode is the value that appears most frequently in the data
  uniqv <- unique(na.omit(v))  # Omit NA values and get unique values
  uniqv[which.max(tabulate(match(v, uniqv)))]  # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
mode_area <- getMode(imputed_df$area)
imputed_df <- mutate(imputed_df, area = ifelse(is.na(area), mode_area, area))
mode_region <- getMode(imputed_df$region)
imputed_df <- mutate(imputed_df, region = ifelse(is.na(region), mode_region, region))
mode_marital_status <- getMode(imputed_df$marital_status)
imputed_df <- mutate(imputed_df, marital_status = ifelse(is.na(marital_status), mode_marital_status, marital_status))


# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).

# Calculate the mode for each binary variable
mode_used_internet <- getMode(imputed_df$used_internet)
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_used_computer_tablet <- getMode(imputed_df$used_computer_tablet)
mode_owns_mobile_phone <- getMode(imputed_df$owns_mobile_phone)
mode_access_to_media <- getMode(imputed_df$access_to_media)

# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
  used_internet = ifelse(is.na(used_internet), mode_used_internet, used_internet),
  current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
  health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
  awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
  used_computer_tablet = ifelse(is.na(used_computer_tablet), mode_used_computer_tablet, used_computer_tablet),
  owns_mobile_phone = ifelse(is.na(owns_mobile_phone), mode_owns_mobile_phone, owns_mobile_phone),
  access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)

# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))

# 'wealth_index' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Poorest", "Poor",...), using the mode may still be appropriate.
mode_wealth_index <- getMode(imputed_df$wealth_index)
imputed_df <- mutate(imputed_df, wealth_index = ifelse(is.na(wealth_index), mode_wealth_index, wealth_index))
# Check the resulting dataset to confirm changes
summary(imputed_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.76      Mean   :1.408   Mean   :1.684   Mean   :3.403  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :3.000   Max.   :2.000   Max.   :6.000  
##  education_level health_insurance   ethnicity      wealth_index 
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.00  
##  Median :2.000   Median :1.0000   Median :1.000   Median :2.00  
##  Mean   :2.438   Mean   :0.8721   Mean   :1.527   Mean   :2.54  
##  3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.00  
##  Max.   :5.000   Max.   :1.0000   Max.   :4.000   Max.   :5.00  
##  current_contraceptive_use awareness_hiv_aids used_computer_tablet
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000      
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:0.0000      
##  Median :1.0000            Median :1.0000     Median :0.0000      
##  Mean   :0.6168            Mean   :0.8072     Mean   :0.3251      
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000      
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000      
##  used_internet    owns_mobile_phone access_to_media 
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000    1st Qu.:1.0000  
##  Median :1.0000   Median :1.0000    Median :1.0000  
##  Mean   :0.7192   Mean   :0.8886    Mean   :0.9071  
##  3rd Qu.:1.0000   3rd Qu.:1.0000    3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.0000
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
##        age_first_marriage            marital_status                      area 
##                         0                         0                         0 
##                    region           education_level          health_insurance 
##                         0                         0                         0 
##                 ethnicity              wealth_index current_contraceptive_use 
##                         0                         0                         0 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                         0                         0                         0 
##         owns_mobile_phone           access_to_media 
##                         0                         0

Visualization Comparing Before vs. After Imputation

# Histogram for 'age_first_marriage' before imputation
afm_0 <- ggplot(female_df, aes(x = age_first_marriage)) + 
  geom_histogram(fill = "#F7C0C8", color = "black", bins = 30) +
  theme_light() +
  ggtitle("Before Imputation") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

# Histogram for 'age_first_marriage' after imputation
afm_imputed <- ggplot(imputed_df, aes(x = age_first_marriage)) + 
  geom_histogram(fill = "#E83853", color = "black", bins = 30) +
  theme_light() +
  ggtitle("After Imputation") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

# Arrange the two plots side by side
grid.arrange(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Arrange the two plots side by side and capture the layout as a grob
combined_plots <- arrangeGrob(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Now, use ggsave to save the combined plot
#ggsave("combined_age_first_marriage.png", plot = combined_plots, width = 10, height = 5)

Creating Binary Variables for Child Marriage Under 18 and 16

# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
  mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
imputed_df <- imputed_df %>%
  mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
imputed_df <- imputed_df %>%
  select(child_marriage, child_marriage_u16, everything())
# Exporting female_df to a CSV file in the current working directory
#write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)

EDA

Vietnam Map with Regions and Their Percentages of Child Marriage (under 18)

# Aggregate the data by region to get the total number of child marriages under 18 per region
region_counts <- aggregate(child_marriage ~ region, data = imputed_df, FUN = sum)

# Calculate the total number of child marriages under 18 in the dataset
total_child_marriages <- sum(region_counts$child_marriage)

# Calculate the percentage for each region
region_counts$married_u18_perc_of_total <- (region_counts$child_marriage / total_child_marriages) * 100
# Mapping region numbers to names
region_names <- c("Red River Delta", "Northern Midlands And Mountain", 
                  "North Central And Central Coastal", "Central Highlands", 
                  "South East", "Mekong River Delta")
names(region_counts)[1] <- "region_name"
region_counts$region_name <- factor(region_counts$region_name, levels = 1:6, labels = region_names)

# Display the final data frame
print(region_counts)
##                         region_name child_marriage married_u18_perc_of_total
## 1                   Red River Delta            142                  7.226463
## 2    Northern Midlands And Mountain            888                 45.190840
## 3 North Central And Central Coastal            215                 10.941476
## 4                 Central Highlands            271                 13.791349
## 5                        South East            194                  9.872774
## 6                Mekong River Delta            255                 12.977099
# Updated mapping including all provinces and cities in the Red River Delta
province_to_region <- data.frame(
  NAME_1 = c(
    'Bắc Ninh', 'Hà Nam', 'Hà Nội', 'Hải Dương', 'Hải Phòng', 'Hoà Bình', 'Hưng Yên', 'Nam Định', 'Ninh Bình', 'Thái Bình', 'Vĩnh Phúc', # Red River Delta 11
    
    'Bắc Giang', 'Bắc Kạn', 'Cao Bằng', 'Hà Giang', 'Lạng Sơn', 'Lào Cai', 'Phú Thọ', 'Quảng Ninh', 'Thái Nguyên', 'Tuyên Quang', 'Yên Bái', 'Điện Biên', 'Lai Châu', 'Sơn La', # Northern Midlands And Mountain 14
    
    'Bình Định', 'Bình Thuận', 'Khánh Hòa', 'Ninh Thuận', 'Phú Yên', 'Quảng Nam', 'Quảng Ngãi', 'Thừa Thiên Huế', 'Đà Nẵng', 'Hà Tĩnh', 'Nghệ An', 'Quảng Bình', 'Quảng Trị', 'Thanh Hóa', # North Central And Central Coastal 14
    
    'Đắk Lắk', 'Đắk Nông', 'Gia Lai', 'Kon Tum', 'Lâm Đồng', # Central Highlands 5
    
    'Bà Rịa - Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Đồng Nai', 'Hồ Chí Minh', 'Tây Ninh', # South East 6
    
    'An Giang', 'Bạc Liêu', 'Bến Tre', 'Cà Mau', 'Cần Thơ', 'Đồng Tháp', 'Hậu Giang', 'Kiên Giang', 'Long An', 'Sóc Trăng', 'Tiền Giang', 'Trà Vinh', 'Vĩnh Long' # Mekong River Delta 13
  ),
  Region = c(
    rep('Red River Delta', 11), 
    rep('Northern Midlands And Mountain', 14), 
    rep('North Central And Central Coastal', 14), 
    rep('Central Highlands', 5), 
    rep('South East', 6), 
    rep('Mekong River Delta', 13)
  )
)
# Check the mapping
#print(province_to_region)
# Read shapefile
vietnam_shape <- st_read('/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp')
## Multiple layers are present in data source /Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp, reading layer `gadm41_VNM_2'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Reading layer `gadm41_VNM_2' from data source 
##   `/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 710 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 102.1446 ymin: 8.381355 xmax: 109.4692 ymax: 23.39269
## Geodetic CRS:  WGS 84
# Join the shapefile with the province-to-region mapping
vietnam_shape_with_region <- vietnam_shape %>%
  left_join(province_to_region, by = "NAME_1")

# Aggregate the shapefile data by region
vietnam_regions <- vietnam_shape_with_region %>%
  group_by(Region) %>%
  summarise(geometry = st_union(geometry), .groups = 'drop')
# Join the aggregated shapefile data with the child marriage data
vietnam_map_data <- vietnam_regions %>%
  left_join(region_counts, by = c("Region" = "region_name"))

# Define colors with your specific choices
colors_ordered <- setNames(c("#B20033", "#CD0A25", "#E83853", "#EF7D8D", "#F7C0C8", "#FBE1E5"),
                           c("Northern Midlands And Mountain", "Central Highlands", 
                             "Mekong River Delta", "North Central And Central Coastal", 
                             "South East", "Red River Delta"))

# Plot
mapvn <- ggplot(data = vietnam_map_data) +
  geom_sf(aes(fill = factor(Region, levels = names(colors_ordered))), color = NA) +
  geom_sf_text(aes(label = sprintf("%.1f%%", married_u18_perc_of_total)), size = 4, hjust = 0.5, vjust = 0.5) +
  scale_fill_manual(values = colors_ordered, name = "Region") +
  labs(title = "Regional Contribution of Female Child Marriage Rates to Total Rates in Vietnam") +
  theme_void() +
  theme(legend.position = "right")

print(mapvn)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

#ggsave("mapvn.png", plot = mapvn, width = 8, height = 6, dpi = 300)

Marital Status Among Female Respondants by Region in Vietnam

# Calculate total observations
total_obs <- nrow(imputed_df)

# Aggregate counts for each category by region without altering the original 'region' field
counts_df <- aggregate(cbind(ever_married = imputed_df$marital_status %in% c(1, 2), 
                             married_u18 = imputed_df$child_marriage == 1, 
                             married_u16 = imputed_df$child_marriage_u16 == 1) ~ region, 
                       data = imputed_df, 
                       FUN = sum)

# Convert counts to percentages
counts_df$ever_married <- (counts_df$ever_married / total_obs) * 100
counts_df$married_u18 <- (counts_df$married_u18 / total_obs) * 100
counts_df$married_u16 <- (counts_df$married_u16 / total_obs) * 100
p <- ggplot(counts_df, aes(x = factor(region))) +
  geom_bar(aes(y = ever_married, fill = "Ever married"), stat = "identity") +
  geom_bar(aes(y = married_u18, fill = "Married before 18 years old"), stat = "identity") +
  geom_bar(aes(y = married_u16, fill = "Married before 16 years old"), stat = "identity") +
  # Adding text labels for ever_married
  geom_text(aes(y = ever_married, label = sprintf("%.1f%%", ever_married)), 
            position = position_stack(vjust = 1.025), 
            size = 4, color = "black") +
  # Adding text labels for married_u18
  geom_text(aes(y = married_u18, label = sprintf("%.1f%%", married_u18)), 
            position = position_stack(vjust = 1.05), 
            size = 4, color = "black") +
  # Adding text labels for married_u16
  geom_text(aes(y = married_u16, label = sprintf("%.1f%%", married_u16)), 
            position = position_stack(vjust = 1.1), 
            size = 4, color = "black") +
  scale_fill_manual(values = c("Ever married" = "#FBE1E5", 
                               "Married before 18 years old" = "#EF7D8D", 
                               "Married before 16 years old" = "#B20016"),
                    name = "Marital Status") +
  labs(x = "Region", y = "Percentage", title = "Marital Status Among Female Respondants by Region") +
  theme_minimal() +
  theme(plot.margin = margin(t = 10, r = 10, b = 10, l = 10, unit = "mm"), 
        axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.position = "right") +
  scale_x_discrete(labels = c("1" = "Red River Delta", "2" = "Northern Midlands And Mountain", 
                              "3" = "North Central And Central Coastal", "4" = "Central Highlands", 
                              "5" = "South East", "6" = "Mekong River Delta"))

# Convert ggplot object to plotly for interactive visualization
ggplotly(p)

Preparing for Regression Analysis

Correlation Matrix

# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")

# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
    geom_tile() +
    scale_fill_gradientn(
        colours = c("deepskyblue", "white", "#CD0A25"),
        values = scales::rescale(c(-1, 0, 1)),
        limits = c(-1, 1),
        name="Pearson\nCorrelation"
    ) +
    theme_minimal() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("") + 
    ylab("") +
    ggtitle("Correlation Matrix") 

# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)

Converting Categorical and Binary Variables to Factors

# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = FALSE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = FALSE)

# Binary variables are already in the correct format and can be used as is

Base Logistic Regression Model

# Baseline model for reference
baseline_model <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)

# Summarize the baseline model
summary(baseline_model)
## 
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index + 
##     health_insurance + current_contraceptive_use + awareness_hiv_aids + 
##     access_to_media, family = binomial(), data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.73391    0.13169  -5.573 2.50e-08 ***
## area2                      0.45543    0.07999   5.693 1.25e-08 ***
## education_level1          -0.41350    0.08705  -4.750 2.03e-06 ***
## education_level2          -0.58693    0.08518  -6.890 5.57e-12 ***
## education_level3          -1.52807    0.11502 -13.286  < 2e-16 ***
## education_level4          -3.24812    0.51198  -6.344 2.24e-10 ***
## education_level5          -3.19482    0.25323 -12.616  < 2e-16 ***
## wealth_index2             -0.46183    0.07994  -5.777 7.61e-09 ***
## wealth_index3             -0.80552    0.09964  -8.085 6.24e-16 ***
## wealth_index4             -0.61389    0.10926  -5.618 1.93e-08 ***
## wealth_index5             -0.89341    0.14898  -5.997 2.01e-09 ***
## health_insurance           0.06907    0.08094   0.853    0.393    
## current_contraceptive_use  0.40123    0.06065   6.615 3.71e-11 ***
## awareness_hiv_aids        -0.47875    0.06960  -6.879 6.05e-12 ***
## access_to_media           -0.01461    0.08121  -0.180    0.857    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10439.1  on 11293  degrees of freedom
## Residual deviance:  8493.7  on 11279  degrees of freedom
## AIC: 8523.7
## 
## Number of Fisher Scoring iterations: 7

Accessing Base Model’s Multicollinearity

# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
base_vif_results <- vif(baseline_model)
print(base_vif_results)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.110732  1        1.053913
## education_level           1.707863  5        1.054983
## wealth_index              1.401192  4        1.043067
## health_insurance          1.021120  1        1.010505
## current_contraceptive_use 1.034853  1        1.017277
## awareness_hiv_aids        1.500511  1        1.224954
## access_to_media           1.268704  1        1.126367
# Check if any VIF value is greater than a typical threshold, like 5 or 10.
base_high_vif <- base_vif_results[base_vif_results > 5]
print(base_high_vif)
## numeric(0)

Adding Fixed Effects to Base Model

# Enhanced model with additional fixed effects
enhanced_model <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, 
                      family = binomial(), 
                      data = imputed_df)

# Summarize the new model with FEs
summary(enhanced_model)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media, family = binomial(), 
##     data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.54522    0.17906  -8.630  < 2e-16 ***
## area2                      0.28689    0.08488   3.380 0.000725 ***
## region2                    0.37758    0.12919   2.923 0.003469 ** 
## region3                    0.18170    0.12745   1.426 0.153967    
## region4                    0.34057    0.12599   2.703 0.006869 ** 
## region5                   -0.05340    0.12612  -0.423 0.672028    
## region6                   -0.05277    0.13336  -0.396 0.692301    
## education_level1          -0.05925    0.09353  -0.634 0.526382    
## education_level2          -0.22571    0.09227  -2.446 0.014438 *  
## education_level3          -1.17196    0.12172  -9.629  < 2e-16 ***
## education_level4          -2.96956    0.51385  -5.779 7.51e-09 ***
## education_level5          -2.89574    0.25599 -11.312  < 2e-16 ***
## ethnicity2                 0.07771    0.10592   0.734 0.463184    
## ethnicity3                 0.27807    0.12830   2.167 0.030210 *  
## ethnicity4                 1.15504    0.10626  10.870  < 2e-16 ***
## wealth_index2             -0.12498    0.08523  -1.466 0.142533    
## wealth_index3             -0.46330    0.10565  -4.385 1.16e-05 ***
## wealth_index4             -0.26442    0.11695  -2.261 0.023760 *  
## wealth_index5             -0.54064    0.16030  -3.373 0.000744 ***
## health_insurance          -0.05157    0.08277  -0.623 0.533244    
## current_contraceptive_use  0.48480    0.06265   7.739 1.00e-14 ***
## awareness_hiv_aids        -0.19387    0.07592  -2.554 0.010661 *  
## access_to_media           -0.07880    0.08517  -0.925 0.354845    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10439.1  on 11293  degrees of freedom
## Residual deviance:  8240.2  on 11271  degrees of freedom
## AIC: 8286.2
## 
## Number of Fisher Scoring iterations: 7

Model with FEs’ Multicollinearity

# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
FE_vif_results <- vif(enhanced_model)
print(FE_vif_results)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.254248  1        1.119932
## region                    4.916245  5        1.172636
## education_level           1.956030  5        1.069394
## ethnicity                 4.235677  3        1.272000
## wealth_index              1.934925  4        1.086008
## health_insurance          1.051524  1        1.025438
## current_contraceptive_use 1.049061  1        1.024237
## awareness_hiv_aids        1.679129  1        1.295812
## access_to_media           1.312005  1        1.145428
# Check if any VIF value is greater than a typical threshold, like 5.
FE_high_vif <- FE_vif_results[FE_vif_results > 5]
print(FE_high_vif)
## numeric(0)

No VIF is significant larger than 5. This suggests that the independent variables in my model with Fixed Effects do not suffer from severe multicollinearity.

Assessing AICs and BICs of Base Model vs Fixed Effects Model

# Model comparison using AIC
aic_base <- AIC(baseline_model)
aic_enhanced <- AIC(enhanced_model)
cat("AIC - Base Model:", aic_base, "\n")
## AIC - Base Model: 8523.675
cat("AIC - Enhanced Model:", aic_enhanced, "\n")
## AIC - Enhanced Model: 8286.168

Additionally, the lower AIC for the model with two fixed effects compared to the baseline model suggests that adding these fixed effects improves the model’s overall fit. This aligns with my theoretical justification for including region and ethnicity as relevant factors in predicting child marriage.

# Model comparison using BIC
bic_base <- BIC(baseline_model)
bic_enhanced <- BIC(enhanced_model)
cat("BIC - Base Model:", bic_base, "\n")
## BIC - Base Model: 8633.655
cat("BIC - Enhanced Model:", bic_enhanced, "\n")
## BIC - Enhanced Model: 8454.805

Lower BIC values suggest that, despite the added complexity (more parameters), the model with both region and ethnicity provides a better overall fit to my data when adjusted for the number of predictors. This is a strong indication that these variables are meaningful in explaining the variance in child marriage occurrences in my dataset.

Model with Interaction Terms (Area:Wealth_Index)

# Adding the interaction term between education level and wealth index
model_interaction <- update(enhanced_model, . ~ . + area:wealth_index)
summary(model_interaction)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + area:wealth_index, 
##     family = binomial(), data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.78525    0.21262  -8.396  < 2e-16 ***
## area2                      0.54558    0.15257   3.576 0.000349 ***
## region2                    0.36523    0.13009   2.807 0.004994 ** 
## region3                    0.16778    0.12857   1.305 0.191877    
## region4                    0.32337    0.12701   2.546 0.010892 *  
## region5                   -0.06803    0.12726  -0.535 0.592956    
## region6                   -0.06059    0.13410  -0.452 0.651371    
## education_level1          -0.06117    0.09352  -0.654 0.513085    
## education_level2          -0.21895    0.09231  -2.372 0.017701 *  
## education_level3          -1.17282    0.12176  -9.632  < 2e-16 ***
## education_level4          -2.96880    0.51416  -5.774 7.74e-09 ***
## education_level5          -2.89550    0.25820 -11.214  < 2e-16 ***
## ethnicity2                 0.06171    0.10624   0.581 0.561352    
## ethnicity3                 0.27262    0.12824   2.126 0.033518 *  
## ethnicity4                 1.12852    0.10687  10.559  < 2e-16 ***
## wealth_index2              0.26715    0.19445   1.374 0.169473    
## wealth_index3             -0.15465    0.21122  -0.732 0.464075    
## wealth_index4             -0.04152    0.22195  -0.187 0.851622    
## wealth_index5             -0.32837    0.26777  -1.226 0.220078    
## health_insurance          -0.04238    0.08289  -0.511 0.609164    
## current_contraceptive_use  0.49215    0.06274   7.844 4.35e-15 ***
## awareness_hiv_aids        -0.18696    0.07599  -2.460 0.013883 *  
## access_to_media           -0.07199    0.08522  -0.845 0.398196    
## area2:wealth_index2       -0.47937    0.21523  -2.227 0.025928 *  
## area2:wealth_index3       -0.38304    0.24189  -1.584 0.113299    
## area2:wealth_index4       -0.26224    0.25631  -1.023 0.306247    
## area2:wealth_index5       -0.25049    0.31993  -0.783 0.433646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10439.1  on 11293  degrees of freedom
## Residual deviance:  8234.7  on 11267  degrees of freedom
## AIC: 8288.7
## 
## Number of Fisher Scoring iterations: 7

Assessing AICs and BICs of Fixed Effects Model vs Fixed Effects Model w/ Interaction Terms

cat("AIC (enhanced):", AIC(enhanced_model), "\nAIC (w/ interaction terms):", AIC(model_interaction), "\n")
## AIC (enhanced): 8286.168 
## AIC (w/ interaction terms): 8288.662
cat("BIC (enhanced):", BIC(enhanced_model), "\nBIC (w/ interaction terms):", BIC(model_interaction), "\n")
## BIC (enhanced): 8454.805 
## BIC (w/ interaction terms): 8486.627
  • The AIC and BIC results for my two models – the enhanced_model and model_interaction – indicate that the simpler model (without interaction terms) has a better trade-off between model fit and complexity.
  • The AIC is lower for the enhanced_model compared to the model_interaction. This suggests that, despite possibly having a good fit, the addition of interaction terms to the model_interaction does not improve the model’s performance enough to justify the added complexity. A lower AIC value is generally preferred, implying that the enhanced_model is a better model choice from an information criterion standpoint.
  • The BIC, like the AIC, is also lower for the enhanced_model. BIC, which includes a stricter penalty for the number of parameters, reinforces the conclusion drawn from the AIC: the additional complexity introduced by interaction terms does not lead to a proportional improvement in the model fit. BIC being more stringent, particularly in models with larger sample sizes and more parameters, suggests a strong preference for the simpler enhanced_model.
  • Both AIC and BIC results point towards the enhanced_model as a more optimal balance between goodness of fit and parsimony. This indicates that, while interaction terms add detail and complexity to the model, they might not be contributing significantly to explaining the variability in your dependent variable.

Logistic Regression Models Comparison (Odd Ratios and 95% Confidence Intervals)

# Function to add significance asterisks
add_asterisks <- function(p_value) {
  if (is.na(p_value)) {
    return(NA)
  } else if (p_value < 0.001) {
    return("***")
  } else if (p_value < 0.01) {
    return("**")
  } else if (p_value < 0.05) {
    return("*")
  } else {
    return("")
  }
}
# Function to format confidence intervals as a string
format_ci <- function(lower, upper) {
  paste0("(", round(lower, 2), ", ", round(upper, 2), ")")
}
# Tidy the baseline model with confidence intervals
tidy_baseline <- tidy(baseline_model, conf.int = TRUE, exponentiate = TRUE)

# Tidy the enhanced model with confidence intervals
tidy_enhanced <- tidy(enhanced_model, conf.int = TRUE, exponentiate = TRUE)

# Tidy the interaction model with confidence intervals
tidy_interaction <- tidy(model_interaction, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_baseline$asterisks <- sapply(tidy_baseline$p.value, add_asterisks)
tidy_enhanced$asterisks <- sapply(tidy_enhanced$p.value, add_asterisks)
tidy_interaction$asterisks <- sapply(tidy_interaction$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_baseline <- tidy_baseline %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_enhanced <- tidy_enhanced %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_interaction <- tidy_interaction %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )
# Add a 'Model' column to each tidied dataframe
tidy_baseline <- tidy_baseline %>% mutate(Model = "Baseline")
tidy_enhanced <- tidy_enhanced %>% mutate(Model = "Enhanced")
tidy_interaction <- tidy_interaction %>% mutate(Model = "Interaction")

# Combine and pivot the dataframes
combined_results <- bind_rows(
  tidy_baseline %>% select(term, OR, CI, Model),
  tidy_enhanced %>% select(term, OR, CI, Model),
  tidy_interaction %>% select(term, OR, CI, Model)
) %>%
  pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_results[is.na(combined_results)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_results <- combined_results %>%
  select(term, 
         OR_Baseline, CI_Baseline, 
         OR_Enhanced, CI_Enhanced, 
         OR_Interaction, CI_Interaction)
# Print the final combined table
print(combined_results)
## # A tibble: 27 × 7
##    term           OR_Baseline CI_Baseline OR_Enhanced CI_Enhanced OR_Interaction
##    <chr>          <chr>       <chr>       <chr>       <chr>       <chr>         
##  1 (Intercept)    0.48***     (0.37, 0.6… 0.21***     (0.15, 0.3) 0.17***       
##  2 area2          1.58***     (1.35, 1.8… 1.33***     (1.13, 1.5… 1.73***       
##  3 education_lev… 0.66***     (0.56, 0.7… 0.94        (0.78, 1.1… 0.94          
##  4 education_lev… 0.56***     (0.47, 0.6… 0.8*        (0.67, 0.9… 0.8*          
##  5 education_lev… 0.22***     (0.17, 0.2… 0.31***     (0.24, 0.3… 0.31***       
##  6 education_lev… 0.04***     (0.01, 0.0… 0.05***     (0.02, 0.1… 0.05***       
##  7 education_lev… 0.04***     (0.02, 0.0… 0.06***     (0.03, 0.0… 0.06***       
##  8 wealth_index2  0.63***     (0.54, 0.7… 0.88        (0.75, 1.0… 1.31          
##  9 wealth_index3  0.45***     (0.37, 0.5… 0.63***     (0.51, 0.7… 0.86          
## 10 wealth_index4  0.54***     (0.44, 0.6… 0.77*       (0.61, 0.9… 0.96          
## # ℹ 17 more rows
## # ℹ 1 more variable: CI_Interaction <chr>

Models Validation

Hosmer-Lemeshow Test

# 1. Hosmer-Lemeshow Test for the Baseline Model
hoslem.test(baseline_model$y, fitted(baseline_model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  baseline_model$y, fitted(baseline_model)
## X-squared = 10.565, df = 8, p-value = 0.2276
# 2. Hosmer-Lemeshow Test for the Enhanced Model (Baseline + Fixed Effects)
hoslem.test(enhanced_model$y, fitted(enhanced_model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  enhanced_model$y, fitted(enhanced_model)
## X-squared = 10.538, df = 8, p-value = 0.2293
# 3. Hosmer-Lemeshow Test for the Interaction Model (Baseline + Fixed Effects + Interaction Terms)
hoslem.test(model_interaction$y, fitted(model_interaction), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  model_interaction$y, fitted(model_interaction)
## X-squared = 10.272, df = 8, p-value = 0.2465

A large p-value (>0.05) indicates a good fit, meaning that there’s no significant difference between the observed and predicted values. Through each model, the p-value increases which suggests that our decision to include fixed effects and interaction terms are significant.

Likelihood Ratio Test (ANOVA)

# Baseline vs. Baseline + Fixed Effects
anova(baseline_model, enhanced_model, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance + 
##     current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     11279     8493.7                          
## 2     11271     8240.2  8   253.51 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Enhanced Model provides a significantly better fit to the data compared to the Baseline Model, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).

# Baseline + Fixed Effects vs. Baseline + Fixed Effects + Interaction Terms
anova(enhanced_model, model_interaction, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + area:wealth_index
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1     11271     8240.2                     
## 2     11267     8234.7  4    5.506   0.2392

Adding the interaction terms between area and wealth_index does not significantly improve the model fit compared to the Enhanced Model without interaction terms. This is indicated by the relatively high p-value and the smaller decrease in residual deviance.

ROC Curve and AUC

# For each model
roc_response_baseline <- roc(imputed_df$child_marriage, fitted(baseline_model))
auc_baseline <- auc(roc_response_baseline)

roc_response_enhanced <- roc(imputed_df$child_marriage, fitted(enhanced_model))
auc_enhanced <- auc(roc_response_enhanced)

roc_response_interaction <- roc(imputed_df$child_marriage, fitted(model_interaction))
auc_interaction <- auc(roc_response_interaction)

# Compare AUC values
print(paste("AUC Baseline Model:", auc_baseline))
## [1] "AUC Baseline Model: 0.794541604239919"
print(paste("AUC Enhanced Model:", auc_enhanced))
## [1] "AUC Enhanced Model: 0.809330968003957"
print(paste("AUC Interaction Model:", auc_interaction))
## [1] "AUC Interaction Model: 0.80987822317723"
  1. Baseline Model (AUC: 0.7945):

The AUC value is close to 0.8, which indicates that the Baseline Model has good discriminative ability. In other words, it is capable of distinguishing between cases and controls with a high degree of accuracy. An AUC of 0.5 represents a model with no discriminative ability (akin to random guessing), while an AUC of 1.0 represents perfect discrimination. So, my model is performing substantially better than random guessing.

  1. Enhanced Model (AUC: 0.8093):

This model shows a slight improvement in AUC over the Baseline Model. The increase suggests that the additional variables (or adjustments) you made in the Enhanced Model contribute positively to its ability to differentiate between cases and controls. The difference in AUC between the Baseline and Enhanced models, while modest, is still meaningful, especially in practical, real-world contexts.

  1. Interaction Model (AUC: 0.8099):

The Interaction Model shows a very slight improvement in AUC over the Enhanced Model. This indicates that adding interaction terms provides a marginal improvement in the model’s discriminatory power. However, the improvement is very minimal, which aligns with my earlier findings that the interaction terms did not significantly improve the model fit.

  1. Overall:

All models demonstrate good ability to distinguish between cases and controls. An AUC greater than 0.7 is generally considered acceptable, and my models are around or above 0.8. The Enhanced and Interaction Models only show marginal improvements in AUC compared to the Baseline Model. This suggests that while the additional complexity (more variables and interaction terms) does contribute slightly to model performance, the gains are not substantial. Given the slight increases in AUC with added model complexity, consider the trade-offs. A simpler model might be preferable if it is easier to interpret and communicate, especially if the increase in predictive power is minimal. Personally, given this result, I think Enhanced Model might be a better-suited model overall.

Gut Check: Redo Analysis with Actual Data (no imputation)

Recreate the dataframe with removed missing data

# No imputation -- Remove rows with any missing data from 'female_df'
og_df <- na.omit(female_df)
head(og_df)
## # A tibble: 6 × 14
##   age_first_marriage marital_status  area region education_level
##                <dbl>          <dbl> <dbl>  <dbl>           <dbl>
## 1                 17              1     1      1               2
## 2                 22              1     1      1               5
## 3                 26              1     1      1               5
## 4                 21              1     1      1               3
## 5                 18              1     1      1               1
## 6                 22              1     1      1               4
## # ℹ 9 more variables: health_insurance <dbl>, ethnicity <dbl>,
## #   wealth_index <dbl>, current_contraceptive_use <dbl>,
## #   awareness_hiv_aids <dbl>, used_computer_tablet <dbl>, used_internet <dbl>,
## #   owns_mobile_phone <dbl>, access_to_media <dbl>

Recreate necessary variables

# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
og_df <- og_df %>%
  mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
og_df <- og_df %>%
  mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
og_df <- og_df %>%
  select(child_marriage, child_marriage_u16, everything())

Converting Categorical and Binary Variables to Factors

# Convert nominal and ordinal variables to factors
og_df$area <- as.factor(og_df$area)
og_df$region <- as.factor(og_df$region)
og_df$education_level <- factor(og_df$education_level, ordered = FALSE)
og_df$ethnicity <- as.factor(og_df$ethnicity)
og_df$wealth_index <- factor(og_df$wealth_index, ordered = FALSE)

# Binary variables are already in the correct format and can be used as is

Recreate the regression models without imputed data

# Base model
base_model_no_impute <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)

# Summarize the base model
summary(base_model_no_impute)
## 
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index + 
##     health_insurance + current_contraceptive_use + awareness_hiv_aids + 
##     access_to_media, family = binomial(), data = og_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.12852    0.15839  -0.811  0.41714    
## area2                      0.27195    0.09515   2.858  0.00426 ** 
## education_level1          -0.44184    0.10273  -4.301 1.70e-05 ***
## education_level2          -0.51162    0.10336  -4.950 7.43e-07 ***
## education_level3          -1.27268    0.13835  -9.199  < 2e-16 ***
## education_level4          -3.39394    0.71961  -4.716 2.40e-06 ***
## education_level5          -3.17824    0.46414  -6.848 7.51e-12 ***
## wealth_index2             -0.61389    0.09307  -6.596 4.22e-11 ***
## wealth_index3             -0.92891    0.11199  -8.295  < 2e-16 ***
## wealth_index4             -0.75155    0.12357  -6.082 1.19e-09 ***
## wealth_index5             -1.02338    0.16941  -6.041 1.53e-09 ***
## health_insurance           0.10983    0.09266   1.185  0.23587    
## current_contraceptive_use -0.10949    0.07039  -1.555  0.11984    
## awareness_hiv_aids        -0.46608    0.08253  -5.647 1.63e-08 ***
## access_to_media            0.01062    0.10367   0.102  0.91843    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6819.3  on 6443  degrees of freedom
## Residual deviance: 5889.2  on 6429  degrees of freedom
## AIC: 5919.2
## 
## Number of Fisher Scoring iterations: 7
# Enhanced model with additional fixed effects
enhanced_model_no_impute <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)

# Summarize the new model with FEs
summary(enhanced_model_no_impute)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media, family = binomial(), 
##     data = og_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.70949    0.23279  -7.343 2.08e-13 ***
## area2                      0.17214    0.10044   1.714  0.08654 .  
## region2                    0.10548    0.16065   0.657  0.51143    
## region3                    0.03594    0.15601   0.230  0.81778    
## region4                   -0.10432    0.17061  -0.611  0.54090    
## region5                    0.06092    0.14434   0.422  0.67298    
## region6                    0.10315    0.14996   0.688  0.49156    
## education_level1           0.12519    0.11676   1.072  0.28362    
## education_level2           0.07838    0.11878   0.660  0.50933    
## education_level3          -0.71860    0.15086  -4.763 1.91e-06 ***
## education_level4          -2.82957    0.72295  -3.914 9.08e-05 ***
## education_level5          -2.61912    0.46855  -5.590 2.27e-08 ***
## ethnicity2                 0.55070    0.13875   3.969 7.21e-05 ***
## ethnicity3                 0.33886    0.14008   2.419  0.01556 *  
## ethnicity4                 1.90113    0.16173  11.755  < 2e-16 ***
## wealth_index2             -0.05571    0.10820  -0.515  0.60665    
## wealth_index3             -0.34294    0.12995  -2.639  0.00831 ** 
## wealth_index4             -0.14989    0.14419  -1.040  0.29857    
## wealth_index5             -0.41552    0.19232  -2.161  0.03073 *  
## health_insurance          -0.07933    0.09623  -0.824  0.40971    
## current_contraceptive_use -0.02792    0.07341  -0.380  0.70370    
## awareness_hiv_aids        -0.08636    0.09460  -0.913  0.36132    
## access_to_media            0.17785    0.11026   1.613  0.10674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6819.3  on 6443  degrees of freedom
## Residual deviance: 5658.7  on 6421  degrees of freedom
## AIC: 5704.7
## 
## Number of Fisher Scoring iterations: 7

Assessing AICs and BICs of Base Model vs Fixed Effects Model

# Model comparison using AIC
aic_base <- AIC(base_model_no_impute)
aic_enhanced <- AIC(enhanced_model_no_impute)
cat("AIC - Base Model:", aic_base, "\n")
## AIC - Base Model: 5919.176
cat("AIC - Enhanced Model:", aic_enhanced, "\n")
## AIC - Enhanced Model: 5704.682

Additionally, the lower AIC for the model with two fixed effects compared to the baseline model suggests that adding these fixed effects improves the model’s overall fit. This aligns with my theoretical justification for including region and ethnicity as relevant factors in predicting child marriage.

# Model comparison using BIC
bic_base <- BIC(base_model_no_impute)
bic_enhanced <- BIC(enhanced_model_no_impute)
cat("BIC - Base Model:", bic_base, "\n")
## BIC - Base Model: 6020.739
cat("BIC - Enhanced Model:", bic_enhanced, "\n")
## BIC - Enhanced Model: 5860.413

Lower BIC values suggest that, despite the added complexity (more parameters), the model with both region and ethnicity provides a better overall fit to my data when adjusted for the number of predictors. This is a strong indication that these variables are meaningful in explaining the variance in child marriage occurrences in my dataset.

Model with Interaction Terms (Area:Wealth_Index)

# Adding the interaction term between education level and wealth index
interaction_model_no_impute <- update(enhanced_model_no_impute, . ~ . + area:wealth_index)
summary(interaction_model_no_impute)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + area:wealth_index, 
##     family = binomial(), data = og_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.71245    0.30284  -5.655 1.56e-08 ***
## area2                      0.15095    0.24236   0.623  0.53341    
## region2                    0.12616    0.16152   0.781  0.43477    
## region3                    0.05816    0.15716   0.370  0.71134    
## region4                   -0.08816    0.17115  -0.515  0.60647    
## region5                    0.08191    0.14672   0.558  0.57664    
## region6                    0.12494    0.15100   0.827  0.40799    
## education_level1           0.12367    0.11674   1.059  0.28943    
## education_level2           0.07710    0.11879   0.649  0.51630    
## education_level3          -0.71184    0.15091  -4.717 2.39e-06 ***
## education_level4          -2.79700    0.72341  -3.866  0.00011 ***
## education_level5          -2.58385    0.46919  -5.507 3.65e-08 ***
## ethnicity2                 0.55095    0.13897   3.965 7.35e-05 ***
## ethnicity3                 0.33370    0.14008   2.382  0.01721 *  
## ethnicity4                 1.90109    0.16307  11.658  < 2e-16 ***
## wealth_index2              0.02709    0.27554   0.098  0.92168    
## wealth_index3             -0.35431    0.28892  -1.226  0.22007    
## wealth_index4             -0.24012    0.30088  -0.798  0.42484    
## wealth_index5             -0.66376    0.37279  -1.781  0.07499 .  
## health_insurance          -0.07528    0.09636  -0.781  0.43465    
## current_contraceptive_use -0.02755    0.07345  -0.375  0.70764    
## awareness_hiv_aids        -0.08558    0.09457  -0.905  0.36551    
## access_to_media            0.17709    0.11027   1.606  0.10827    
## area2:wealth_index2       -0.11570    0.29382  -0.394  0.69374    
## area2:wealth_index3        0.01145    0.31297   0.037  0.97081    
## area2:wealth_index4        0.13487    0.32848   0.411  0.68137    
## area2:wealth_index5        0.37698    0.41566   0.907  0.36443    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6819.3  on 6443  degrees of freedom
## Residual deviance: 5656.6  on 6417  degrees of freedom
## AIC: 5710.6
## 
## Number of Fisher Scoring iterations: 7
cat("AIC (enhanced):", AIC(enhanced_model_no_impute), "\nAIC (w/ interaction terms):", AIC(interaction_model_no_impute), "\n")
## AIC (enhanced): 5704.682 
## AIC (w/ interaction terms): 5710.578
cat("BIC (enhanced):", BIC(enhanced_model_no_impute), "\nBIC (w/ interaction terms):", BIC(interaction_model_no_impute), "\n")
## BIC (enhanced): 5860.413 
## BIC (w/ interaction terms): 5893.392
# VIFs check (A VIF value > 5 indicates high multicollinearity)
# Base model
base_no_impute_vif <- vif(base_model_no_impute)
print(base_no_impute_vif)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.161113  1        1.077549
## education_level           1.635562  5        1.050429
## wealth_index              1.521114  4        1.053829
## health_insurance          1.028226  1        1.014015
## current_contraceptive_use 1.009489  1        1.004733
## awareness_hiv_aids        1.471075  1        1.212879
## access_to_media           1.212302  1        1.101046
# Enhanced model
enhanced_no_impute_vif <- vif(enhanced_model_no_impute)
print(enhanced_no_impute_vif)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.295560  1        1.138227
## region                    5.816954  5        1.192531
## education_level           2.116453  5        1.077856
## ethnicity                 7.824091  3        1.408983
## wealth_index              2.687468  4        1.131535
## health_insurance          1.087063  1        1.042623
## current_contraceptive_use 1.025473  1        1.012656
## awareness_hiv_aids        1.772812  1        1.331470
## access_to_media           1.256323  1        1.120858
# Interacton model
interaction_no_impute_vif <- vif(interaction_model_no_impute)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(interaction_no_impute_vif)
##                                 GVIF Df GVIF^(1/(2*Df))
## area                        7.510342  1        2.740500
## region                      6.056756  5        1.197358
## education_level             2.125445  5        1.078313
## ethnicity                   7.978570  3        1.413581
## wealth_index              497.956732  4        2.173446
## health_insurance            1.090010  1        1.044035
## current_contraceptive_use   1.026554  1        1.013190
## awareness_hiv_aids          1.771947  1        1.331145
## access_to_media             1.256367  1        1.120878
## area:wealth_index         466.507434  4        2.155794

Comparing Models With vs Without Imputed Data

# Extracting data from the models
extract_model_data <- function(model) {
  model_summary <- summary(model)
  coeffs <- model_summary$coefficients
  data.frame(
    Term = rownames(coeffs),
    Estimate = sprintf("%.3f", coeffs[, "Estimate"]),
    pValue = ifelse(coeffs[, "Pr(>|z|)"] < 0.001, 
                    format(coeffs[, "Pr(>|z|)"], scientific = TRUE),
                    sprintf("%.3f", coeffs[, "Pr(>|z|)"])),
    Significance = sapply(coeffs[, "Pr(>|z|)"], add_asterisks)
  )
}
# Applying the function to each model
base_impute_data <- extract_model_data(baseline_model)
base_no_impute_data <- extract_model_data(base_model_no_impute)

# Combine the data for comparison
base_comparison_data <- merge(base_impute_data, base_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)

# View the comparison
print(base_comparison_data)
##                         Term Estimate_Impute pValue_Impute Significance_Impute
## 1                (Intercept)          -0.734  2.502722e-08                 ***
## 2                      area2           0.455  1.245520e-08                 ***
## 3           education_level1          -0.413  2.032253e-06                 ***
## 4           education_level2          -0.587  5.570356e-12                 ***
## 5           education_level3          -1.528  2.806068e-40                 ***
## 6           education_level4          -3.248  2.235120e-10                 ***
## 7           education_level5          -3.195  1.720431e-36                 ***
## 8              wealth_index2          -0.462  7.607170e-09                 ***
## 9              wealth_index3          -0.806  6.235617e-16                 ***
## 10             wealth_index4          -0.614  1.926274e-08                 ***
## 11             wealth_index5          -0.893  2.012553e-09                 ***
## 12          health_insurance           0.069         0.393                    
## 13 current_contraceptive_use           0.401  3.710837e-11                 ***
## 14        awareness_hiv_aids          -0.479  6.047566e-12                 ***
## 15           access_to_media          -0.015         0.857                    
##    Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1             -0.129           0.417                      
## 2              0.272           0.004                    **
## 3             -0.442    1.701057e-05                   ***
## 4             -0.512    7.430972e-07                   ***
## 5             -1.273    3.606682e-20                   ***
## 6             -3.394    2.400883e-06                   ***
## 7             -3.178    7.512748e-12                   ***
## 8             -0.614    4.224206e-11                   ***
## 9             -0.929    1.089044e-16                   ***
## 10            -0.752    1.186365e-09                   ***
## 11            -1.023    1.532441e-09                   ***
## 12             0.110           0.236                      
## 13            -0.109           0.120                      
## 14            -0.466    1.630404e-08                   ***
## 15             0.011           0.918
  • Education and Wealth Index variables are consistent, robust and significant predictors of the outcome across both models.
  • For some variables like ‘current contraceptive use’, imputation significantly impacts its role in the model. It could indicate the missing data mechanism’s influence.
  • Significance of Area: This variable’s significance suggests geographical variation is an important factor, but its influence is reduced in the non-imputed model.
  • The overall patterns suggest that while some predictors are consistently influential, others are sensitive to how missing data is handled.
# Applying the function to each model
enhanced_impute_data <- extract_model_data(enhanced_model)
enhanced_no_impute_data <- extract_model_data(enhanced_model_no_impute)

# Combine the data for comparison
enhanced_comparison_data <- merge(enhanced_impute_data, enhanced_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)

# View the comparison
print(enhanced_comparison_data)
##                         Term Estimate_Impute pValue_Impute Significance_Impute
## 1                (Intercept)          -1.545  6.147348e-18                 ***
## 2                      area2           0.287  7.251287e-04                 ***
## 3                    region2           0.378         0.003                  **
## 4                    region3           0.182         0.154                    
## 5                    region4           0.341         0.007                  **
## 6                    region5          -0.053         0.672                    
## 7                    region6          -0.053         0.692                    
## 8           education_level1          -0.059         0.526                    
## 9           education_level2          -0.226         0.014                   *
## 10          education_level3          -1.172  6.057140e-22                 ***
## 11          education_level4          -2.970  7.510453e-09                 ***
## 12          education_level5          -2.896  1.143797e-29                 ***
## 13                ethnicity2           0.078         0.463                    
## 14                ethnicity3           0.278         0.030                   *
## 15                ethnicity4           1.155  1.603993e-27                 ***
## 16             wealth_index2          -0.125         0.143                    
## 17             wealth_index3          -0.463  1.158858e-05                 ***
## 18             wealth_index4          -0.264         0.024                   *
## 19             wealth_index5          -0.541  7.444127e-04                 ***
## 20          health_insurance          -0.052         0.533                    
## 21 current_contraceptive_use           0.485  1.004528e-14                 ***
## 22        awareness_hiv_aids          -0.194         0.011                   *
## 23           access_to_media          -0.079         0.355                    
##    Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1             -1.709    2.081962e-13                   ***
## 2              0.172           0.087                      
## 3              0.105           0.511                      
## 4              0.036           0.818                      
## 5             -0.104           0.541                      
## 6              0.061           0.673                      
## 7              0.103           0.492                      
## 8              0.125           0.284                      
## 9              0.078           0.509                      
## 10            -0.719    1.905736e-06                   ***
## 11            -2.830    9.081803e-05                   ***
## 12            -2.619    2.273156e-08                   ***
## 13             0.551    7.214439e-05                   ***
## 14             0.339           0.016                     *
## 15             1.901    6.687210e-32                   ***
## 16            -0.056           0.607                      
## 17            -0.343           0.008                    **
## 18            -0.150           0.299                      
## 19            -0.416           0.031                     *
## 20            -0.079           0.410                      
## 21            -0.028           0.704                      
## 22            -0.086           0.361                      
## 23             0.178           0.107
  • Education levels and ethnicity (specific categories) are robust across both models.
  • Certain variables like area, region, and wealth index are sensitive to imputation, suggesting that missing data in these areas could be informative.
  • The loss of significance in many variables in the non-imputed model suggests that the missing data may not be random and could be related to the variables’ influence on the outcome.
# Applying the function to each model
interaction_impute_data <- extract_model_data(model_interaction)
interaction_no_impute_data <- extract_model_data(interaction_model_no_impute)

# Combine the data for comparison
interaction_comparison_data <- merge(interaction_impute_data, interaction_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)

# View the comparison
print(interaction_comparison_data)
##                         Term Estimate_Impute pValue_Impute Significance_Impute
## 1                (Intercept)          -1.785  4.599871e-17                 ***
## 2                      area2           0.546  3.488899e-04                 ***
## 3                    region2           0.365         0.005                  **
## 4                    region3           0.168         0.192                    
## 5                    region4           0.323         0.011                   *
## 6                    region5          -0.068         0.593                    
## 7                    region6          -0.061         0.651                    
## 8           education_level1          -0.061         0.513                    
## 9           education_level2          -0.219         0.018                   *
## 10          education_level3          -1.173  5.833944e-22                 ***
## 11          education_level4          -2.969  7.739192e-09                 ***
## 12          education_level5          -2.896  3.473185e-29                 ***
## 13                ethnicity2           0.062         0.561                    
## 14                ethnicity3           0.273         0.034                   *
## 15                ethnicity4           1.129  4.598972e-26                 ***
## 16             wealth_index2           0.267         0.169                    
## 17             wealth_index3          -0.155         0.464                    
## 18             wealth_index4          -0.042         0.852                    
## 19             wealth_index5          -0.328         0.220                    
## 20          health_insurance          -0.042         0.609                    
## 21 current_contraceptive_use           0.492  4.348509e-15                 ***
## 22        awareness_hiv_aids          -0.187         0.014                   *
## 23           access_to_media          -0.072         0.398                    
## 24       area2:wealth_index2          -0.479         0.026                   *
## 25       area2:wealth_index3          -0.383         0.113                    
## 26       area2:wealth_index4          -0.262         0.306                    
## 27       area2:wealth_index5          -0.250         0.434                    
##    Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1             -1.712    1.561885e-08                   ***
## 2              0.151           0.533                      
## 3              0.126           0.435                      
## 4              0.058           0.711                      
## 5             -0.088           0.606                      
## 6              0.082           0.577                      
## 7              0.125           0.408                      
## 8              0.124           0.289                      
## 9              0.077           0.516                      
## 10            -0.712    2.393561e-06                   ***
## 11            -2.797    1.104547e-04                   ***
## 12            -2.584    3.649218e-08                   ***
## 13             0.551    7.354197e-05                   ***
## 14             0.334           0.017                     *
## 15             1.901    2.093819e-31                   ***
## 16             0.027           0.922                      
## 17            -0.354           0.220                      
## 18            -0.240           0.425                      
## 19            -0.664           0.075                      
## 20            -0.075           0.435                      
## 21            -0.028           0.708                      
## 22            -0.086           0.366                      
## 23             0.177           0.108                      
## 24            -0.116           0.694                      
## 25             0.011           0.971                      
## 26             0.135           0.681                      
## 27             0.377           0.364
  • The effects of imputation are most pronounced in geographical variables (area and region) and some interactions, indicating a potential non-random pattern in how missing data occurs in these variables.
  • Certain variables like higher education levels and some ethnicity categories maintain their predictive power regardless of missing data imputation, indicating a stronger and more consistent relationship with the outcome.